In [3]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
In [4]:
data = pd.read_csv("insurance.csv")
In [5]:
data
Out[5]:
age sex bmi children smoker region charges
0 19 female 27.900 0 yes southwest 16884.92400
1 18 male 33.770 1 no southeast 1725.55230
2 28 male 33.000 3 no southeast 4449.46200
3 33 male 22.705 0 no northwest 21984.47061
4 32 male 28.880 0 no northwest 3866.85520
... ... ... ... ... ... ... ...
1333 50 male 30.970 3 no northwest 10600.54830
1334 18 female 31.920 0 no northeast 2205.98080
1335 18 female 36.850 0 no southeast 1629.83350
1336 21 female 25.800 0 no southwest 2007.94500
1337 61 female 29.070 0 yes northwest 29141.36030

1338 rows × 7 columns

In [6]:
data.head()
Out[6]:
age sex bmi children smoker region charges
0 19 female 27.900 0 yes southwest 16884.92400
1 18 male 33.770 1 no southeast 1725.55230
2 28 male 33.000 3 no southeast 4449.46200
3 33 male 22.705 0 no northwest 21984.47061
4 32 male 28.880 0 no northwest 3866.85520
In [7]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB
In [8]:
data.region.value_counts()
Out[8]:
southeast    364
northwest    325
southwest    325
northeast    324
Name: region, dtype: int64
In [9]:
fig = px.histogram(data, x='age', marginal='box', nbins=47, title='Distribution of Age')
fig.update_layout(bargap=0.1)
fig.show()
In [10]:
fig = px.histogram(data, y='charges', marginal='box', nbins=47, title='Distribution of Charge')
fig.update_layout(bargap=0.1)
fig.show()
In [11]:
#Charges vs Age
plt.figure(figsize = (20,10))
sns.lineplot(x = data.age, y = data.charges, hue= data.sex)
Out[11]:
<AxesSubplot:xlabel='age', ylabel='charges'>
In [12]:
plt.figure(figsize=(15, 10))
plt.title("Charges vs region")
sns.barplot(x='region', y='charges', hue='sex', data=data)
Out[12]:
<AxesSubplot:title={'center':'Charges vs region'}, xlabel='region', ylabel='charges'>
In [13]:
data.dtypes
Out[13]:
age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object
In [14]:
#Converting objects labels into categorical
data[['sex', 'smoker', 'region']].astype('category')
data.dtypes
Out[14]:
age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object
In [15]:
#Converting category labels into numerical using LabelEncoder
from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()
label.fit(data.sex.drop_duplicates())
data.sex = label.transform(data.sex)
label.fit(data.smoker.drop_duplicates())
data.smoker = label.transform(data.smoker)
label.fit(data.region.drop_duplicates())
data.region = label.transform(data.region)
data.dtypes
Out[15]:
age           int64
sex           int32
bmi         float64
children      int64
smoker        int32
region        int32
charges     float64
dtype: object
In [16]:
#Linear Regression

from sklearn.model_selection import train_test_split as holdout
from sklearn.linear_model import LinearRegression
from sklearn import metrics

x = data.drop(['charges'], axis = 1)
y = data['charges']
x_train, x_test, y_train, y_test = holdout(x, y, test_size=0.2, random_state=0)
Lin_reg = LinearRegression()
Lin_reg.fit(x_train, y_train)
print(Lin_reg.score(x_test, y_test))
0.7998747145449959
In [17]:
#Ridge Regression

from sklearn.linear_model import Ridge

Ridge = Ridge(alpha=0.5)
Ridge.fit(x_train, y_train)
print(Ridge.score(x_test, y_test))
0.799698963206314
In [18]:
# Lasso Regression

from sklearn.linear_model import Lasso

Lasso = Lasso(alpha=0.2, fit_intercept=True, normalize=False, precompute=False, max_iter=1000,
              tol=0.0001, warm_start=False, positive=False, random_state=None, selection='cyclic')
Lasso.fit(x_train, y_train)
print(Lasso.score(x_test, y_test))
0.7998690236224705